1. Which of the vital signs are actually recorded the most? (e.g. between the different O2 measures)
We would want to use these measurements -- the ones that are most accessible / most often recorded
2. How many measurements are available from the same encounter before a patient"s RRT event? What"s the time frame we have before RRT events?
Use this info to help us shape how we timebox measurements for prediction.
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import datetime as datetime
from impala.util import as_pandas
from impala.dbapi import connect
In [ ]:
# connect to impala
conn = connect(host=host="mycluster.domain.com", port=my_impala_port_number)
In [ ]:
# Make sure we"re pulling from the right location
cur = conn.cursor()
cur.execute("use my_db")
In [ ]:
query = """
SELECT \
ce.clinical_event_id \
, ce.event_id \
, ce.encntr_id \
, ce.person_id \
, ce.event_cd \
, cv_event_cd.description AS event_description \
, ce.performed_dt_tm AS unix_performed_dt_tm \
, from_unixtime(CAST(ce.performed_dt_tm / 1000 as bigint)) AS performed_dt_tm \
, ce.event_tag \
, ce.result_val \
, cv_result_units_cd.display AS result_units_display \
, ce.result_time_units_cd \
, ce.catalog_cd \
FROM clinical_event ce \
LEFT OUTER JOIN code_value cv_event_cd \
ON ce.event_cd = cv_event_cd.code_value \
LEFT OUTER JOIN code_value cv_result_units_cd \
ON ce.result_units_cd = cv_result_units_cd.code_value \
WHERE ce.encntr_id IN ( SELECT DISTINCT encntr_id \
FROM clinical_event \
WHERE event_cd = '54411998' \
AND result_status_cd NOT IN ('31', '36') \
AND valid_until_dt_tm > unix_timestamp() \
AND event_class_cd not in ('654645') \
ORDER BY RAND() \
LIMIT 10 \
) \
AND ce.event_cd IN ( \
'703306',\
'703501',\
'703511',\
'703516',\
'703540',\
'703558',\
'703565',\
'703569',\
'703960',\
'2700541',\
'2700653',\
'3623994',\
'4674677',\
'4686698',\
'54411998'\
)\
ORDER BY ce.encntr_id, ce.performed_dt_tm;
"""
In [ ]:
cur.execute(query)
df = as_pandas(cur)
In [ ]:
df.head()
In [ ]:
df.encntr_id.value_counts()
In [ ]:
# pull off all the encounter ids into an array to loop over
df.encntr_id.value_counts().index.get_values()
In [ ]:
df_enc = df[df.encntr_id=='108285121']
rrt_times = df_enc.unix_performed_dt_tm[df_enc.event_cd == '54411998'].unique()
In [ ]:
rrt_times
In [ ]:
newdf = df_enc.groupby(['event_cd']).count().iloc[:,0].to_frame()
In [ ]:
newdf
In [ ]:
print newdf.sort_values('clinical_event_id', ascending = False)
In [ ]:
# Function to show how many values of different vitals signs for event_cd appeared in the intervals between RRT events
# for this encounterid.
# Start with a df from clinical_event table, where each line contains either a vital sign reading or an RRT event form
# must include encntr_id, event_cd, unixdatestamp.
def output_vitalsbeforeRRT(df):
'''
Function to show how many values of different vitals signs for event_cd appeared in the intervals between RRT events
for this encounterid Start with a df from clinical_event table, where each line contains either a vital sign reading or
an RRT event form must include encntr_id, event_cd, unixdatestamp
'''
encntr_array = df.encntr_id.value_counts().index.get_values()
# loop through all encounter ids
for encounter_id in encntr_array:
print "Encounter id: {0}".format(encounter_id)
df_enc = df[df.encntr_id==encounter_id]
# find the different unique RRT times
rrt_times = df_enc.unix_performed_dt_tm[df_enc.event_cd == '54411998'].unique()
print rrt_times
# go through each interval & print counts for values
time_prev=0
for time_now in rrt_times:
df_enc_sub = df_enc[(df_enc.unix_performed_dt_tm < time_now) & (df_enc.unix_performed_dt_tm > time_prev)]
print "time_now: {0}; time_prev: {1}".format(time_now, time_prev)
print df_enc_sub.groupby(['event_cd']).count().iloc[:,0].to_frame().sort_values('clinical_event_id', ascending = False)
print "------------------"
time_prev = time_now
print "================================"
return "Done"
In [ ]:
output_vitalsbeforeRRT(df)
In [ ]:
import pandas as pd
In [ ]:
query_vitals = """
SELECT \
ce.clinical_event_id \
, ce.event_id \
, ce.encntr_id
, ce.person_id \
, ce.event_cd \
, cv_event_cd.description AS event_description \
, ce.performed_dt_tm AS unix_performed_dt_tm \
, from_unixtime(CAST(ce.performed_dt_tm / 1000 as bigint)) AS performed_dt_tm \
, ce.event_tag \
, ce.result_val \
, cv_result_units_cd.display AS result_units_display \
, ce.result_time_units_cd \
, ce.catalog_cd \
FROM clinical_event ce \
LEFT OUTER JOIN code_value cv_event_cd \
ON ce.event_cd = cv_event_cd.code_value \
LEFT OUTER JOIN code_value cv_result_units_cd \
ON ce.result_units_cd = cv_result_units_cd.code_value \
WHERE ce.encntr_id ='105479870' \
AND ce.event_cd IN ( \
'703306',\
'703501',\
'703511',\
'703516',\
'703540',\
'703558',\
'703565',\
'703569',\
'2700541',\
'2700653',\
'3623994',\
'4674677',\
'4686698',\
'4690633',\
'54411998',\
'54408578'\
)\
ORDER BY ce.encntr_id, ce.performed_dt_tm;
"""
cur.execute(query_vitals)
df = as_pandas(cur)
In [ ]:
df["timestamp"] = df.performed_dt_tm.apply(lambda x: pd.Timestamp(x))
In [ ]:
# Exploring different codes below
# df[df.event_cd=='54411998']
# df[df.event_cd=='54408578']
# df[df.event_cd=='54408578']['event_tag'].get_values()[0]
print "RRT reason for call: {0}".format(df[df.event_cd=='54408578']['event_tag'].get_values()[0])
In [ ]:
df['result_val'] = pd.to_numeric(df.result_val, errors = 'coerce')
In [ ]:
df.dtypes
In [ ]:
df_new = pd.pivot_table(df, values="result_val", index='timestamp', columns = 'event_description')
In [ ]:
df_new.tail()
In [ ]:
# Grab unique RRT event times:
RRTEventTimes = df.unix_performed_dt_tm[df.event_cd == "54411998"].unique()
RRT_times = [pd.to_datetime(x, unit='ms') for x in RRTEventTimes]
In [ ]:
df_new.columns.get_values()
In [ ]:
test = df_new.columns.get_values()
In [ ]:
colnames = [item for item in test if item not in ["RRT Event Form", "RRT Primary Reason for Call"]]
In [ ]:
colnames
In [ ]:
# pull dates of encounter so we can set xlim up the right way.
query_minmaxtimes = "SELECT arrive_dt_tm, depart_dt_tm FROM encounter WHERE encntr_id = '105479870';"
cur.execute(query_minmaxtimes)
arr_dep = cur.fetchall()
arr = arr_dep[0][0]
dep = arr_dep[0][1]
In [ ]:
arr = arr_dep[0][0]
In [ ]:
dep = arr_dep[0][1]
In [ ]:
print arr; print dep
In [ ]:
arr = pd.to_datetime(arr, unit="ms")
dep = pd.to_datetime(dep, unit='ms')
In [ ]:
# Here is an example of a random patient's vitals over time
for name in colnames:
plt.figure(figsize=(14,6))
mask = np.isfinite(df_new[name])
plt.plot(df_new[mask].index.to_pydatetime(), df_new[name][mask], '-o')
plt.title(name)
plt.tick_params(labelsize=14)
plt.xlim([arr.to_pydatetime(), dep.to_pydatetime()])
# add vertical lines when RRT Events took place
for val in RRT_times:
plt.axvline(x=val.to_pydatetime(), linewidth = 2)
In [ ]:
def timeseries_randomRRTpatient():
'''
Query for & plot the vital signs of a patient with an RRT event.
Currently written to return a random encounter.
'''
query_encid = """SELECT DISTINCT encntr_id \
FROM clinical_event \
WHERE event_cd = '54411998' \
AND result_status_cd NOT IN ('31', '36') \
AND valid_until_dt_tm > unix_timestamp() \
AND event_class_cd not in ('654645');"""
# Note - have note checked that encounter has valid end date...
# And haven't separated by location.
cur.execute(query_encid)
RRT_enc_ids = cur.fetchall()
RRTencids = [enc[0] for enc in RRT_enc_ids]
enc_id = np.random.choice(RRTencids)
query_vitals = """
SELECT \
ce.clinical_event_id \
, ce.event_id \
, ce.encntr_id \
, ce.person_id \
, ce.event_cd \
, cv_event_cd.description AS event_description \
, ce.performed_dt_tm AS unix_performed_dt_tm \
, from_unixtime(CAST(ce.performed_dt_tm / 1000 as bigint)) AS performed_dt_tm \
, ce.event_tag \
, ce.result_val \
, cv_result_units_cd.display AS result_units_display \
, ce.result_time_units_cd \
, ce.catalog_cd \
FROM clinical_event ce \
LEFT OUTER JOIN code_value cv_event_cd \
ON ce.event_cd = cv_event_cd.code_value \
LEFT OUTER JOIN code_value cv_result_units_cd \
ON ce.result_units_cd = cv_result_units_cd.code_value \
WHERE ce.encntr_id = '{0}' \
AND ce.event_cd IN ( \
'703306',\
'703501',\
'703511',\
'703516',\
'703540',\
'703558',\
'703565',\
'703569',\
'2700541',\
'2700653',\
'3623994',\
'4674677',\
'4686698',\
'4690633',\
'54411998',\
'54408578'\
)\
ORDER BY ce.encntr_id, ce.performed_dt_tm;""".format(enc_id)
cur.execute(query_vitals)
df = as_pandas(cur)
df["timestamp"] = df.performed_dt_tm.apply(lambda x: pd.Timestamp(x))
df['result_val'] = pd.to_numeric(df.result_val, errors = 'coerce')
# pivot the data so it's in the format we need for plotting
df_new = pd.pivot_table(df, values="result_val", index='timestamp', columns = 'event_description')
# Grab unique RRT event times & convert to right format
RRTEventTimes = df.unix_performed_dt_tm[df.event_cd == "54411998"].unique()
RRT_times = [pd.to_datetime(x, unit='ms') for x in RRTEventTimes]
# get columns to query
init_colnames = df_new.columns.get_values()
colnames = [item for item in init_colnames if item not in ["RRT Event Form", "RRT Primary Reason for Call"]]
# print encounter id
print "encntr_id: {0}".format(df.encntr_id[0])
print "RRT reason for call: {0}".format(df[df.event_cd=='54408578']['event_tag'].get_values()[0])
# pull up & print reason for visit
query_reason = "SELECT reason_for_visit FROM encounter WHERE encntr_id = '{0}';".format(df.encntr_id[0])
cur.execute(query_reason)
reason = cur.fetchall()
reason = reason[0][0]
print "Reason for encounter: {0}".format(reason)
# pull dates of encounter so we can set xlim up the right way.
query_minmaxtimes = "SELECT arrive_dt_tm, depart_dt_tm FROM encounter WHERE encntr_id = '{0}';".format(df.encntr_id[0])
cur.execute(query_minmaxtimes)
arr_dep = cur.fetchall()
arr = pd.to_datetime(arr_dep[0][0], unit="ms")
dep = pd.to_datetime(arr_dep[0][1], unit="ms")
# plotting: PLOT ALL THE THINGS
for name in colnames:
plt.figure(figsize=(14,8))
mask = np.isfinite(df_new[name])
plt.plot(df_new[mask].index.to_pydatetime(), df_new[name][mask], '-o')
plt.title(name)
plt.tick_params(labelsize=14)
plt.xlim([arr.to_pydatetime(), dep.to_pydatetime()])
# add vertical lines when RRT Events took place
for val in RRT_times:
plt.axvline(x=val.to_pydatetime(), linewidth = 2)
In [ ]:
timeseries_randomRRTpatient()